knitr::opts_chunk$set(tidy=TRUE, highlight=TRUE, dev="png",
cache=TRUE, highlight=TRUE, autodep=TRUE, warning=FALSE, error=FALSE,
message=FALSE, prompt=TRUE, comment='', fig.cap='')
> library(ggplot2)
> library(dplyr)
> library(tidyr)
> library(biomaRt)
> library(pheatmap)
This is looking at a first sequencing run of single cells. There are three samples, H1, H2 and H3. There is some prior knowledge that the H1 sample might have some quality problems.
For a single-cell experiment, we expect to see a set of barcodes that come from real cells and then a drop. Below we show the barcode distribution for a published experiment on K562 cells and then each of these samples. We look at only the top 100,000 barcodes for each sample.
If you click on the tabs you can see each of the charts. We can see that the samples are missing the characteristic dip in the barcode distribution. Normally we would use this dip to determine a cutoff to filter out real cells.
This may be due to there not being enough depth of sequencing or it may be due to the samples having something wrong with them. It would be helpful to ask the sequencing core what they think about this.
> samples = c("H1", "H2", "H3")
> barcode_files = file.path("umis", samples, "cb-histogram.txt")
> K562_bc = read.table("metadata/k562-histogram.txt", header = FALSE, sep = "\t",
+ nrows = 1e+05)
> colnames(K562_bc) = c("barcode", "count")
> K562_bc$rank = as.numeric(rownames(K562_bc))
> ggplot(K562_bc, aes(rank, count)) + scale_x_log10() + scale_y_log10() + geom_point() +
+ theme_bw() + ggtitle("K562 barcode count distribution")
> K562_small = read.table("metadata/k562-histogram-60million.txt", header = FALSE,
+ sep = "\t", nrows = 1e+05)
> colnames(K562_small) = c("barcode", "count")
> K562_small$rank = as.numeric(rownames(K562_small))
> ggplot(K562_small, aes(rank, count)) + scale_x_log10() + scale_y_log10() + geom_point() +
+ theme_bw() + ggtitle("K562 barcode count distribution")
> h1_bc = read.table(barcode_files[1], header = FALSE, sep = "\t", nrows = 1e+05)
> colnames(h1_bc) = c("barcode", "count")
> h1_bc$rank = as.numeric(rownames(h1_bc))
> ggplot(h1_bc, aes(rank, count)) + scale_x_log10() + scale_y_log10() + geom_point() +
+ theme_bw() + ggtitle("H1 barcode count distribution")
> h2_bc = read.table(barcode_files[2], header = FALSE, sep = "\t", nrows = 1e+05)
> colnames(h2_bc) = c("barcode", "count")
> h2_bc$rank = as.numeric(rownames(h2_bc))
> ggplot(h2_bc, aes(rank, count)) + scale_x_log10() + scale_y_log10() + geom_point() +
+ theme_bw() + ggtitle("H2 barcode count distribution")
> h3_bc = read.table(barcode_files[3], header = FALSE, sep = "\t", nrows = 1e+05)
> colnames(h3_bc) = c("barcode", "count")
> h3_bc$rank = as.numeric(rownames(h3_bc))
> ggplot(h3_bc, aes(rank, count)) + scale_x_log10() + scale_y_log10() + geom_point() +
+ theme_bw() + ggtitle("H3 barcode count distribution")
This is an R port of Allon’s barcode plots. The matlab command:
[fLog xLog] = hist(log10(counts(counts>0)),50);
y = fLog.*(10.^xLog)/sum(fLog.*(10.^xLog));
plot(xLog,y*100,'linewidth',2)
hist from matlab returns a tuple of counts and centers. R returns a dataframe with mids and counts as columns. So, the R version of this plot is:
> allon_barcode_plot = function(bcs, sample) {
+ bcs_hist = hist(log10(bcs$count), plot = FALSE, n = 50)
+ fLog = bcs_hist$count
+ xLog = bcs_hist$mids
+ y = fLog * (10^xLog)/sum(fLog * (10^xLog))
+ print(qplot(xLog, y) + geom_point() + theme_bw() + ggtitle(sample))
+ return(data.frame(x = xLog, y = y, sample = sample))
+ }
This results in plots where it is easier to call a peak. It looks like we can set a rough cutoff of around 10^4 for the H1 and H3 samples. H2 doesn’t have a clear peak but 10^4 might be a reasonable cutoff.
> k562_allon = allon_barcode_plot(K562_small, "K562")
> h1_allon = allon_barcode_plot(h1_bc, "H1")
> h2_allon = allon_barcode_plot(h2_bc, "H2")
> h3_allon = allon_barcode_plot(h3_bc, "H3")
> vals = rbind(k562_allon, h1_allon, h2_allon, h3_allon)
> ggplot(vals, aes(x, y, color = sample)) + geom_point() + theme_bw()
Here we compare the counts for the K562 data as well as for each of the proximal tubule samples, using several different metrics. We counted at the level of the transcript, so before continuing we will collapse the transcript level counts to gene level counts by summing the counts for each transcript for each gene.
> mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "mmusculus_gene_ensembl",
+ host = "jul2015.archive.ensembl.org")
> conversions = getBM(attributes = c("ensembl_gene_id", "refseq_mrna", "ensembl_transcript_id",
+ "mgi_symbol"), mart = mart)
> load_counts = function(count_file) {
+ counts = read.table(count_file, header = TRUE, sep = ",")
+ counts = counts %>% tidyr::separate(gene, into = c("transcript", "version"),
+ sep = "\\.", fill = "right") %>% dplyr::left_join(conversions, c(transcript = "ensembl_transcript_id")) %>%
+ dplyr::filter(mgi_symbol != "") %>% dplyr::select(-transcript, -version,
+ -ensembl_gene_id, -refseq_mrna) %>% tidyr::gather(cell, count, -mgi_symbol) %>%
+ dplyr::group_by(cell, mgi_symbol) %>% dplyr::summarise(counts = sum(count)) %>%
+ tidyr::spread(cell, counts, fill = 0)
+ counts = as.data.frame(counts)
+ rownames(counts) = counts$mgi_symbol
+ counts$mgi_symbol = NULL
+ return(counts)
+ }
> load_bcbio_refseq = function(count_file) {
+ counts = read.table(count_file, header = TRUE, sep = ",")
+ counts = counts %>% dplyr::left_join(conversions, c(gene = "refseq_mrna")) %>%
+ dplyr::filter(mgi_symbol != "") %>% dplyr::select(-gene, -ensembl_gene_id,
+ -ensembl_transcript_id) %>% tidyr::gather(cell, count, -mgi_symbol) %>%
+ dplyr::group_by(cell, mgi_symbol) %>% dplyr::summarise(counts = sum(count)) %>%
+ tidyr::spread(cell, counts, fill = 0)
+ counts = as.data.frame(counts)
+ rownames(counts) = counts$mgi_symbol
+ counts$mgi_symbol = NULL
+ return(counts)
+ }
> mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl",
+ host = "jul2015.archive.ensembl.org")
> hconversions = getBM(attributes = c("ensembl_gene_id", "refseq_mrna", "ensembl_transcript_id",
+ "hgnc_symbol"), mart = mart)
> load_hcounts = function(count_file) {
+ counts = read.table(count_file, header = TRUE, sep = ",")
+ counts = counts %>% dplyr::left_join(hconversions, c(gene = "ensembl_transcript_id")) %>%
+ dplyr::filter(hgnc_symbol != "") %>% dplyr::select(-gene, -ensembl_gene_id,
+ -refseq_mrna) %>% tidyr::gather(cell, count, -hgnc_symbol, -hgnc_symbol) %>%
+ dplyr::group_by(cell, hgnc_symbol) %>% dplyr::summarise(counts = sum(count)) %>%
+ tidyr::spread(cell, counts, fill = 0)
+ counts = as.data.frame(counts)
+ rownames(counts) = counts$hgnc_symbol
+ counts$hgnc_symbol = NULL
+ return(counts)
+ }
> load_k562 = function(count_file) {
+ k562 = read.table(count_file, header = TRUE, sep = ",") %>% tidyr::separate(gene,
+ into = c("transcript", "symbol"), sep = "\\|") %>% dplyr::select(-transcript) %>%
+ tidyr::gather(cell, count, -symbol) %>% dplyr::group_by(cell, symbol) %>%
+ dplyr::summarise(counts = sum(count)) %>% tidyr::spread(cell, counts,
+ fill = 0)
+ k562 = as.data.frame(k562)
+ rownames(k562) = k562$symbol
+ k562$symbol = NULL
+ k562 = k562[!grepl("ERCC", rownames(k562)), ]
+ return(k562)
+ }
> count_files = file.path("no-ncrna", samples, paste(samples, ".counts", sep = ""))
> h1 = load_counts(count_files[1])
> h2 = load_counts(count_files[2])
> h3 = load_counts(count_files[3])
> count_files = file.path("new-counts", "umis", samples, paste(samples, ".counts",
+ sep = ""))
> h1_refseq = load_bcbio_refseq(count_files[1])
> h2_refseq = load_bcbio_refseq(count_files[2])
> h3_refseq = load_bcbio_refseq(count_files[3])
> k562_grch37 = load_hcounts("metadata/k562-GRCh37.counts")
> k562_refseq = load_k562("metadata/k562-refseq.counts")
Based on Allon’s plots it looks like 10^4 is the cutoff for the H1-H3 samples. We already used that cutoff when we were deciding which cells to count so we won’t do any further filtering. We need to filter the K562 samples to pick out their peak, which is more like 10^5.
> keep_barcodes = function(barcodes, cutoff) {
+ bc = subset(barcodes, count > cutoff)$barcode
+ return(gsub("-", ".", bc))
+ }
> k562_keep = keep_barcodes(K562_bc, 10^5)
> k562_grch37 = k562_grch37[, k562_keep]
> k562_refseq = k562_refseq[, k562_keep]
The H2-H3 samples have much less genes detected even when we drop the sequencing down of the K562 samples down to 60 million reads. The numbers of genes detected between the Ensembl and Refseq annotations look similar.
> k562_grch37_detected = data.frame(cell = colnames(k562_grch37), total = colSums(k562_grch37),
+ detected = colSums(k562_grch37 > 0), sample = "k562_grch37")
> k562_refseq_detected = data.frame(cell = colnames(k562_refseq), total = colSums(k562_refseq),
+ detected = colSums(k562_refseq > 0), sample = "k562_refseq")
> h1_detected = data.frame(cell = colnames(h1), total = colSums(h1), detected = colSums(h1 >
+ 0), sample = "h1")
> h2_detected = data.frame(cell = colnames(h2), total = colSums(h2), detected = colSums(h2 >
+ 0), sample = "h2")
> h3_detected = data.frame(cell = colnames(h3), total = colSums(h3), detected = colSums(h3 >
+ 0), sample = "h3")
> h1_refseq_detected = data.frame(cell = colnames(h1_refseq), total = colSums(h1_refseq),
+ detected = colSums(h1_refseq > 0), sample = "h1_refseq")
> h2_refseq_detected = data.frame(cell = colnames(h2_refseq), total = colSums(h2_refseq),
+ detected = colSums(h2_refseq > 0), sample = "h2_refseq")
> h3_refseq_detected = data.frame(cell = colnames(h3_refseq), total = colSums(h3_refseq),
+ detected = colSums(h3_refseq > 0), sample = "h3_refseq")
> detected = rbind(k562_grch37_detected, h1_detected, h2_detected, h3_detected,
+ k562_refseq_detected, h1_refseq_detected, h2_refseq_detected, h3_refseq_detected)
> detected = transform(detected, cell = reorder(cell, -detected))
>
> ggplot(detected, aes(sample, detected)) + geom_boxplot() + theme_bw()
Looking at histograms and the boxplots, we see about the same number of genes detected irregardless of what annotation we use.
> ggplot(detected, aes(reorder(cell, detected), detected)) + geom_bar(stat = "identity",
+ position = "dodge") + facet_wrap(~sample, scale = "free") + theme_bw() +
+ scale_x_discrete(breaks = NULL) + xlab("cell") + ylab("genes detected")
For the K562 samples, the RefSeq annotation captures more counts per cell. This isn’t true of the H1-H3 samples, though.
> ggplot(detected, aes(sample, total)) + geom_boxplot() + theme_bw() + ylab("total counts per cell") +
+ xlab("")
We can also try to measure the complexity of the library, looking at how many different genes are detected plotted against the total number of counts. Here we can see that these libraries have a fairly poor distribution of counts when using the Ensembl annotation for H1-H3. Using the RefSeq annotation, it looks much better. The Ensembl annotation has a bunch of non-coding RNA and non-polyA RNA annotated, much more than the RefSeq annotation which could be leading to the particularly poor quality using the Ensembl annotation. As a comparison, the K562 sample, however, looks fine using either the Ensembl or the RefSeq annotation.
We can see from these plots a poor sequence quality in the libraries, the K562 sample has a fairly steep change in genes detected with an increased total counts. The H1-H3 samples on the other hand have a much less steep increase, a piece of evidence that these libraries are less complex than the K562 lbraries. We are comparing across species here, so the comparison isn’t a great one.
> ggplot(detected, aes(total, detected, color = sample)) + geom_point() + scale_x_log10() +
+ facet_wrap(~sample) + theme_bw() + xlab("total counts") + ylab("genes detected")
Here we can see a steep dropoff for number of cells a gene is expressed in the H1-H3 samples, meaning there are fewer genes detected as expressed in every cell. The K562 samples, by comparison, have a much less sharp falloff. This indicates that there might be poor capture of RNA from the cells, leading to a less complex library.
> k562_grch37_detected = data.frame(gene = rownames(k562_grch37), total = rowSums(k562_grch37),
+ detected = rowSums(k562_grch37 > 0), sample = "k562_grch37")
> k562_refseq_detected = data.frame(gene = rownames(k562_refseq), total = rowSums(k562_refseq),
+ detected = rowSums(k562_refseq > 0), sample = "k562_refseq")
> h1_detected = data.frame(gene = rownames(h1), total = rowSums(h1), detected = rowSums(h1 >
+ 0), sample = "h1")
> h2_detected = data.frame(gene = rownames(h2), total = rowSums(h2), detected = rowSums(h2 >
+ 0), sample = "h2")
> h3_detected = data.frame(gene = rownames(h3), total = rowSums(h3), detected = rowSums(h3 >
+ 0), sample = "h3")
> h1_refseq_detected = data.frame(gene = rownames(h1_refseq), total = rowSums(h1_refseq),
+ detected = rowSums(h1_refseq > 0), sample = "h1_refseq")
> h2_refseq_detected = data.frame(gene = rownames(h2_refseq), total = rowSums(h2_refseq),
+ detected = rowSums(h2_refseq > 0), sample = "h2_refseq")
> h3_refseq_detected = data.frame(gene = rownames(h3_refseq), total = rowSums(h3_refseq),
+ detected = rowSums(h3_refseq > 0), sample = "h3_refseq")
> detected = rbind(k562_grch37_detected, h1_detected, h2_detected, h3_detected,
+ k562_refseq_detected, h1_refseq_detected, h2_refseq_detected, h3_refseq_detected)
> detected_df = detected
> detected = transform(detected, gene = reorder(gene, -detected))
> detected = subset(detected, total > 100)
> ggplot(detected, aes(reorder(gene, detected), detected)) + geom_bar(stat = "identity",
+ position = "dodge") + facet_wrap(~sample, scale = "free") + theme_bw() +
+ scale_x_discrete(breaks = NULL) + xlab("genes") + ylab("cells detected")
The H1 samples look pretty bad– it is interesting to see that there are sets of genes which are detected as expressed using the Ensembl annotation but not the RefSeq annotation in the H samples.
We can see the top 200 expressed genes have very variable expression across the cells. There are many cells where there is no expression; the K562 cells don’t have this issue. We can see there are probably cells in the H* samples that failed.
> top_200_heatmap = function(counts) {
+ counts = as.matrix(counts)
+ counts = head(counts[order(rowSums(counts), decreasing = TRUE), ], 200)
+ pheatmap(log(counts + 1), show_rownames = FALSE, show_colnames = FALSE)
+ }
> top_200_heatmap(k562_grch37)
> top_200_heatmap(k562_refseq)
> top_200_heatmap(h1)
> top_200_heatmap(h2)
> top_200_heatmap(h3)
> top_200_heatmap(h1_refseq)
> top_200_heatmap(h2_refseq)
> top_200_heatmap(h3_refseq)
We’ll drop the worst cells from the H* samples, where we define worst as the cells with < 500 genes detected. Using 1000 as the cutoff leaves us with tens of cells. This gets us down to hundreds.
> h1 = h1[, colSums(h1 > 0) > 500]
> h2 = h2[, colSums(h2 > 0) > 500]
> h3 = h3[, colSums(h3 > 0) > 500]
> h1_refseq = h1_refseq[, colSums(h1_refseq > 0) > 500]
> h2_refseq = h2_refseq[, colSums(h2_refseq > 0) > 500]
> h3_refseq = h3_refseq[, colSums(h3_refseq > 0) > 500]
This definitely cleaned up the heatmaps, they look much nicer now. Still pretty spotty expression considering these are the top expressed genes, though.
> top_200_heatmap(h1)
> top_200_heatmap(h2)
> top_200_heatmap(h3)
> top_200_heatmap(h1_refseq)
> top_200_heatmap(h2_refseq)
> top_200_heatmap(h3_refseq)
Here we take a shot at identifying the most variable genes to do PCA with, it is tough with the samples we have because they are very low counts.
> library(Seurat)
> k562_grch37.dat = k562_grch37
> colnames(k562_grch37.dat) = paste(colnames(k562_grch37.dat), "k562_grch37",
+ sep = "_")
> k562_grch37.seurat = new("seurat", raw.data = log(k562_grch37.dat + 1))
> k562_grch37.seurat = setup(k562_grch37.seurat, project = "k562_grch37", min.cells = 3,
+ min.genes = 500, is.expr = 0.01, names.field = 2, names.delim = "_")
> k562_grch37.seurat = mean.var.plot(k562_grch37.seurat, y.cutoff = 2, do.plot = TRUE,
+ x.low.cutoff = 0.25, x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)
> h1.dat = h1
> colnames(h1.dat) = paste(colnames(h1.dat), "h1", sep = "_")
> h1.seurat = new("seurat", raw.data = log(h1.dat + 1))
> h1.seurat = setup(h1.seurat, project = "h1", min.cells = 3, min.genes = 500,
+ is.expr = 0.01, names.field = 2, names.delim = "_")
> h1.seurat = mean.var.plot(h1.seurat, y.cutoff = 2, do.plot = TRUE, x.low.cutoff = 0.25,
+ x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)
> h2.dat = h2
> colnames(h2.dat) = paste(colnames(h2.dat), "h2", sep = "_")
> h2.seurat = new("seurat", raw.data = log(h2.dat + 1))
> h2.seurat = setup(h2.seurat, project = "h2", min.cells = 3, min.genes = 500,
+ is.expr = 0.01, names.field = 2, names.delim = "_")
> h2.seurat = mean.var.plot(h2.seurat, y.cutoff = 2, do.plot = TRUE, x.low.cutoff = 0.25,
+ x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)
> h3.dat = h3
> colnames(h3.dat) = paste(colnames(h3.dat), "h3", sep = "_")
> h3.seurat = new("seurat", raw.data = log(h3.dat + 1))
> h3.seurat = setup(h3.seurat, project = "h3", min.cells = 3, min.genes = 500,
+ is.expr = 0.01, names.field = 2, names.delim = "_")
> h3.seurat = mean.var.plot(h3.seurat, y.cutoff = 2, do.plot = TRUE, x.low.cutoff = 0.25,
+ x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)
> h1_refseq.dat = h1_refseq
> colnames(h1_refseq.dat) = paste(colnames(h1_refseq.dat), "h1_refseq", sep = "_")
> h1_refseq.seurat = new("seurat", raw.data = log(h1_refseq.dat + 1))
> h1_refseq.seurat = setup(h1_refseq.seurat, project = "h1_refseq", min.cells = 3,
+ min.genes = 500, is.expr = 0.01, names.field = 2, names.delim = "_")
> h1_refseq.seurat = mean.var.plot(h1_refseq.seurat, y.cutoff = 2, do.plot = TRUE,
+ x.low.cutoff = 0.25, x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)
> h2_refseq.dat = h2_refseq
> colnames(h2_refseq.dat) = paste(colnames(h2_refseq.dat), "h2_refseq", sep = "_")
> h2_refseq.seurat = new("seurat", raw.data = log(h2_refseq.dat + 1))
> h2_refseq.seurat = setup(h2_refseq.seurat, project = "h2_refseq", min.cells = 3,
+ min.genes = 500, is.expr = 0.01, names.field = 2, names.delim = "_")
> h2_refseq.seurat = mean.var.plot(h2_refseq.seurat, y.cutoff = 2, do.plot = TRUE,
+ x.low.cutoff = 0.25, x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)
> h3_refseq.dat = h3_refseq
> colnames(h3_refseq.dat) = paste(colnames(h3_refseq.dat), "h3_refseq", sep = "_")
> h3_refseq.seurat = new("seurat", raw.data = log(h3_refseq.dat + 1))
> h3_refseq.seurat = setup(h3_refseq.seurat, project = "h3_refseq", min.cells = 3,
+ min.genes = 500, is.expr = 0.01, names.field = 2, names.delim = "_")
> h3_refseq.seurat = mean.var.plot(h3_refseq.seurat, y.cutoff = 2, do.plot = TRUE,
+ x.low.cutoff = 0.25, x.high.cutoff = 7, fxn.x = expMean, fxn.y = logVarDivMean)
> k562_grch37.seurat@var.genes
[1] "ABCE1" "ABHD10" "ACBD4" "AFAP1-AS1" "AGA"
[6] "AKT1" "ALS2" "ANGEL2" "AP3M1" "APOE"
[11] "ARMC7" "ASB7" "ATG2B" "ATG4B" "ATXN1"
[16] "ATXN7L1" "B3GNTL1" "BBX" "C10orf12" "C18orf54"
[21] "C19orf12" "C19orf48" "C20orf203" "C2orf88" "CCDC117"
[26] "CDADC1" "CDK19" "CDK5R1" "CEP164" "CETN2"
[31] "CETN3" "CHTF18" "COL1A2" "COLGALT2" "CREBL2"
[36] "CSTF1" "DCUN1D4" "DDIT3" "DKC1" "DNAJC27"
[41] "E2F1" "EEA1" "EEF2K" "EFTUD1" "EGLN3"
[46] "ELMO1" "ERLIN1" "EXTL2" "FAM171A1" "FAM222B"
[51] "FAM3C" "FBXO11" "FCGR2A" "FER" "FGD4"
[56] "FGFBP3" "GEMIN5" "GORASP2" "GPR183" "GPR63"
[61] "HBZ" "HCCS" "HDAC4" "HDHD1" "HIP1"
[66] "HIST1H1B" "HMGB3" "HOXB7" "HOXC6" "IGFBP4"
[71] "IP6K2" "IQGAP2" "ISPD-AS1" "JMY" "KIAA1715"
[76] "KLHL12" "KLHL26" "LYPLA1" "MED21" "MED9"
[81] "MIER1" "MKLN1" "MLEC" "MORF4L2" "MYADM"
[86] "NCK1" "NEMF" "NFIL3" "NUDT16P1" "NUP62"
[91] "ORMDL1" "PAGE1" "PGR" "PHOSPHO2" "PIM1"
[96] "PPP2R4" "PRNP" "PTK2B" "PTPN7" "PUF60"
[101] "R3HCC1L" "RAD23B" "RBM17" "RC3H1" "SF1"
[106] "SFMBT1" "SGOL2" "SGSH" "SKAP2" "SLC25A34"
[111] "SLC29A3" "SLC38A1" "SLC50A1" "SORCS1" "SOS2"
[116] "SSX2IP" "STAT6" "STK38" "STT3A" "SUV420H1"
[121] "TAL2" "TCF7L1" "TMCC1-AS1" "TMEM14A" "TMEM158"
[126] "TMEM74B" "TP73" "TRAF3" "TRIB3" "TRIP12"
[131] "TSPYL4" "UBOX5" "UGP2" "USP2-AS1" "USPL1"
[136] "UTP14A" "VPS36" "WDFY2" "WTAP" "XYLB"
[141] "ZBTB42" "ZCCHC7" "ZNF33A" "ZNF33B" "ZNF343"
[146] "ZNF35" "ZNF507" "ZNF638"
> h1.seurat@var.genes
[1] "6330416G13Rik" "Akt1" "Api5" "App"
[5] "Atf7" "Bcl11b" "Bivm" "Bms1"
[9] "Ccdc127" "Cdk19" "Cebpb" "Celsr2"
[13] "Cflar" "Clcn4-2" "Cxxc1" "Cyp4a12a"
[17] "D8Ertd82e" "Dapk1" "Ddx6" "Dennd6a"
[21] "Derl2" "Dmtn" "Dnajc9" "Dpm2"
[25] "Dync1li2" "Efr3a" "Eif2ak1" "Enpp1"
[29] "Fam163a" "Fbrsl1" "Fbxl20" "Fibp"
[33] "Fkbp1a" "Fmo5" "Fzr1" "Gab1"
[37] "Gatad2b" "Gbe1" "Gng12" "Hbs1l"
[41] "Hnrnpc" "Htatip2" "Ilf3" "Krit1"
[45] "Lhx1" "Lin7c" "Map2k7" "Mapt"
[49] "Mettl21b" "Mospd3" "Mpped2" "Myo1e"
[53] "Neo1" "Nolc1" "Nt5dc2" "Nucb1"
[57] "Pacs2" "Pan2" "Patl1" "Pcgf5"
[61] "Pex11b" "Pla2g12a" "Ppp2r2d" "Ptpn1"
[65] "Rbm4" "Rbpms" "Rexo2" "Rfx1"
[69] "Samd1" "Sec23b" "Shmt2" "Snx13"
[73] "Spin1" "Sptb" "Srsf7" "Stk24"
[77] "Syne4" "Taok2" "Tapbp" "Tbc1d5"
[81] "Tbl2" "Tcerg1" "Tcf25" "Terf2"
[85] "Tgfbr3" "Tjp2" "Trip10" "Ucp2"
[89] "Usp47" "Vps13d" "Zdhhc6" "Zfp131"
[93] "Zfp322a" "Zfp839"
> h2.seurat@var.genes
[1] "Aadat" "Abl1" "Actr1b" "Akr1c18" "Akt1" "Ano6"
[7] "Anxa5" "Apela" "Api5" "Aqp4" "Arid2" "Bcl2"
[13] "Cdkn1a" "Cdkn2c" "Cnot1" "Col27a1" "Copg1" "Cpsf1"
[19] "Cryab" "Csrnp3" "Cxxc1" "Cyp4a12a" "Dapk1" "Dcun1d3"
[25] "Dennd6a" "Derl2" "Dmtn" "Dnajc5" "Eif2ak1" "Eif4ebp2"
[31] "Elp3" "Ergic2" "Fam114a2" "Fbxl20" "Fndc3b" "G6pc3"
[37] "Glis1" "Gmppa" "Gng12" "Gpm6a" "Gsk3a" "Gtpbp1"
[43] "Hp1bp3" "Hsd17b14" "Hsd17b7" "Igf2" "Ilf3" "Irf6"
[49] "Itsn2" "Kat6b" "Kctd13" "Keap1" "Lrrc40" "Lztfl1"
[55] "Map2k7" "Map7" "Mdm4" "Mettl21b" "Mmab" "Mrrf"
[61] "Nsf" "Nsun4" "Nudcd1" "Pan2" "Patl1" "Pcsk7"
[67] "Pex11b" "Pnisr" "Polm" "Pphln1" "Ppp2r2d" "Ppp3cb"
[73] "Psat1" "Ptpdc1" "Pygo2" "Qprt" "Rbbp5" "Sae1"
[79] "Sec23b" "Sfmbt1" "Shmt2" "Slc12a2" "Slc35a5" "Snx10"
[85] "Snx15" "Sp1" "Sp2" "Spidr" "Spin1" "Sptb"
[91] "Tbc1d10b" "Tcf25" "Them7" "Tmem52b" "Tnrc6b" "Traf6"
[97] "Trip10" "Tsc2" "Ttc9c" "Ube2w" "Ugt2b38" "Utrn"
[103] "Zdhhc6" "Zfp131" "Zfp322a" "Zscan22" "Zwint"
> h3.seurat@var.genes
[1] "Aadat" "Abhd10" "Anxa4" "Ap1m1" "Api5" "Arid2" "Atat1"
[8] "Cast" "Cdk19" "Celsr2" "Cflar" "Chic2" "Clip4" "Clock"
[15] "Cyp4a31" "Derl2" "Dmtn" "Dnm3" "Egf" "Ephx2" "Fam63a"
[22] "Fbrsl1" "Fign" "Fkbp1a" "Gab1" "Gpm6a" "Hnrnpc" "Kap"
[29] "Kat2b" "Lurap1l" "Map7" "Mavs" "Mcmbp" "Mmd" "Mrrf"
[36] "Nolc1" "Nono" "Nt5dc3" "Nucb1" "Osr2" "Pacs2" "Papd5"
[43] "Pnkd" "Ptpn1" "Pygo2" "Rab18" "Rbbp5" "Rbm15b" "Rnf152"
[50] "Sec23b" "Shmt2" "Slc12a1" "Spin1" "Stk24" "Stx12" "Syngr1"
[57] "Tfcp2" "Tmem52b" "Uba1" "Ubl4" "Umod" "Wfdc15b" "Zcchc3"
[64] "Zdhhc6" "Zfp963"
> h1_refseq.seurat@var.genes
[1] "6330416G13Rik" "Adipor2" "Ascc1" "Atg13"
[5] "Atn1" "Bcl11b" "Bivm" "Capns1"
[9] "Cdr2" "Clip1" "Crtap" "Ctns"
[13] "Cxxc1" "Cyp4a12a" "Cyp4a12b" "D8Ertd82e"
[17] "Dctn1" "Ddx58" "Dnajc9" "Dpm2"
[21] "Dync1li2" "Efr3a" "Eif2ak1" "Eml3"
[25] "Ezh1" "Faah" "Fam163a" "Fam89b"
[29] "Fbrsl1" "Fitm1" "Fundc2" "Fzr1"
[33] "Gas1" "Gmppa" "Gstt3" "Gtpbp1"
[37] "Irf2bp1" "Lage3" "Lin7c" "Mpped2"
[41] "Myo1e" "Nras" "Nt5dc2" "Nub1"
[45] "Patl1" "Pax2" "Pax8" "Pcgf5"
[49] "Pgm2" "Pgpep1" "Phf2" "Ppp2r2d"
[53] "Prlr" "Psmg2" "Ptpn1" "Rbbp5"
[57] "Rbm4" "Rbpms" "Rexo2" "Rnf152"
[61] "Samd1" "Sec31a" "Snx13" "Sp1"
[65] "Stk24" "Stx12" "Taok2" "Tbl2"
[69] "Tecr" "Tmem135" "Tnks1bp1" "Tomm34"
[73] "Trappc1" "Trappc9" "U2af2" "Ucp2"
[77] "Zdhhc6" "Zmym4" "Zwint"
> h2_refseq.seurat@var.genes
[1] "Aadat" "Abhd4" "Akr1c18" "Anxa5" "Apela" "Apex1"
[7] "Apex2" "Arnt2" "Capns1" "Col27a1" "Ctnnd1" "Cxxc1"
[13] "Cyp2e1" "Cyp4a12a" "Ddah1" "Ddx58" "Dpm2" "Efhd1"
[19] "Efr3a" "Eif2ak1" "Eif4ebp2" "Fbxl20" "Fndc3b" "Fundc2"
[25] "G6pc3" "Gas1" "Glis1" "Gmppa" "Gpatch8" "Gtpbp1"
[31] "Hsd17b14" "Itgb1" "Jun" "Kctd13" "Klf3" "Max"
[37] "Mrrf" "Napsa" "Nsf" "Nsun4" "Olfml1" "Pak1"
[43] "Patl1" "Pbx1" "Peli1" "Polm" "Psat1" "Rab12"
[49] "Rab4b" "Rqcd1" "Scd1" "Sepp1" "Slc12a2" "Slc22a2"
[55] "Slc25a45" "Slc35a3" "Snx15" "Sp1" "Tbc1d10b" "Tbp"
[61] "Tmem52b" "Tnks1bp1" "Ttc9c" "U2af2" "Ubap2" "Ube4b"
[67] "Ucp2" "Ugt2b38" "Usp14" "Ypel2" "Zdhhc6" "Zwint"
> h3_refseq.seurat@var.genes
[1] "Aadat" "Anxa4" "Ap1m1" "Apex1" "Arid2"
[6] "Atat1" "Atn1" "Capns1" "Ccnk" "Cldn19"
[11] "Cxxc1" "Cyp4a31" "Dctn1" "Derl2" "Dnase1"
[16] "Efr3a" "Egf" "Eml3" "Epm2aip1" "Exoc8"
[21] "Ezh1" "Fam210b" "Fam63a" "Fbrsl1" "Fbxo28"
[26] "Fitm1" "Gbas" "Gbe1" "Gmppa" "Gstt3"
[31] "Hs1bp3" "Igf1r" "Kap" "Kdm6b" "Kif1b"
[36] "Kmt2b" "Lurap1l" "Max" "Mpv17l" "Mt1"
[41] "N4bp1" "Npr2" "Nt5dc3" "Pak1ip1" "Patl1"
[46] "Pnkd" "Ptms" "Ptpn1" "Rbbp5" "Reep6"
[51] "Rexo1" "Rexo2" "Rnf152" "Rsu1" "Scamp4"
[56] "Sdad1" "Sdha" "Serpina1f" "Slc12a1" "Slc51a"
[61] "Snx13" "Sp1" "Srsf10" "Stk24" "Syngr1"
[66] "Tbc1d10b" "Tm9sf1" "Tmem52b" "Tnks1bp1" "Tpm4"
[71] "U2af2" "Ucp2" "Vps4a" "Wfdc15b" "Zcchc3"
[76] "Zdhhc6" "Zfp963"
Just to see, even though the variable genes identified looks wonky, in these samples, we ran PCA on all of the samples to see if any groups pop out.
> k562_grch37.seurat = pca(k562_grch37.seurat, pcs.print = 3, genes.print = 5,
+ do.print = FALSE)
> pca.plot(k562_grch37.seurat, pt.size = 2)
> h1.seurat = pca(h1.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h1.seurat, pt.size = 2)
> h2.seurat = pca(h2.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h2.seurat, pt.size = 2)
> h3.seurat = pca(h3.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h3.seurat, pt.size = 2)
> h1_refseq.seurat = pca(h1_refseq.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h1_refseq.seurat, pt.size = 2)
> h2_refseq.seurat = pca(h2_refseq.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h2_refseq.seurat, pt.size = 2)
> h3_refseq.seurat = pca(h3_refseq.seurat, pcs.print = 3, genes.print = 5, do.print = FALSE)
> pca.plot(h3_refseq.seurat, pt.size = 2)
I pulled a set of housekeeping genes from Evidence Based Selection of Housekeeping Genes and converted them to human and mouse symbols. Then I made heatmaps for all of the housekeeping genes. We can see that for the K562 sample, most of the housekeeping genes are expressed strongly whereas in the H1-H3 samples, there is spare expression of the housekeeping genes.
> hk = as.character(read.table("metadata/hk.txt", header = FALSE)[, "V1"])
> hk = c(hk, c("Actb", "Gapdh"))
> human_hk = as.character(read.table("metadata/human-hk.txt", header = FALSE)[,
+ "V1"])
> human_hk = c(human_hk, c("Actb", "Gapdh"))
> housekeeping_heatmap = function(counts, hk) {
+ counts = counts[rownames(counts) %in% hk, ]
+ pheatmap(log(counts + 1), show_rownames = TRUE, show_colnames = FALSE)
+ }
> housekeeping_heatmap(k562_grch37, human_hk)
> housekeeping_heatmap(k562_refseq, human_hk)
> housekeeping_heatmap(h1, hk)
> housekeeping_heatmap(h2, hk)
> housekeeping_heatmap(h3, hk)
> housekeeping_heatmap(h1_refseq, hk)
> housekeeping_heatmap(h2_refseq, hk)
> housekeeping_heatmap(h3_refseq, hk)
It is hard to say what might be the cause of the problems we identified. If we could see the characteristic dip in the barcode distribution like we did for K562, I would feel pretty confident in recomending sequencing these samples some more. Since we do not see that dip, I’m not sure what to think.
I subsampled the K562 data and reran it, the distribution looks exactly the same and has the dip, so it isn’t due to the depth of sequencing.